##  genf.pl
##
##  program to convert any number of Lexis-Nexis downloads into a format used by the MID-SVM classification
##  system.
##
##   Redistribution and use in source and binary forms, with or without modification,
##   are permitted under the terms of the GNU General Public License:
##   http://www.opensource.org/licenses/gpl-license.html
##
##  Report bugs to: schrodt@psu.edu
##
##  Programming supported by National Science Foundation Political Science Program Grant
##  SES-0719634 "Improving the Efficiency of Militarized Interstate Dispute Data Collection using
##  Automated Textual Analysis" and SES-0924240, "MID4: Updating the Militarized Dispute Data Set
##  2002-2010."
##
##  For plausible indenting of this source code, set the tab size in your editor to "2"
##
##  REVISION HISTORY:
##  Original code base appears to have been one or more of the various Lexis-Nexis filters
##  maintained by the KEDS project, http://eventdata.psu.edu
##  Sep-08:  Initial MID-SVM version for 1999 and 2001 downloads by Vladimir Petroff, University of Kansas
##  Feb-10: Assorted modifications to work with the 2002 download by Schrodt
##

## Last Updated September 5, 2010
## Latest Programmer: Vito D'Orazio

## Notes: DPA formatting changed in 2006 and became identical to that of the other sources.  For DPA
## stories from before 2006, this program will NOT work as intended.  Finding the headline and recording it
## accurately is a very important part of this program, primarily because at the end of all the processing,
## coders will read every single headline but not every single news story.  Coding-by-headline is
## unavoidable, and therefore the best we can do is provide the coders with the correct headlines.

## List of compatible news sources by Lexis-Nexis Codes or Names:
## BBC, AP, New York Times, CNN, UPI, TASS, AFX, AFP, Xinhua, The Times London,
## DPA, The Gazette Montreal, Japan Economic Newswire, Interfax, Jerusalem Post

## List of subroutines: Check, Nexts, Filter


#!/usr/local/bin/perl

# ======== globals =========== #

# --- fixed strings ---
$docs     = 'DOCUMENT';                  # the doc tag delimiting
$opinions = 'SECTION: NEWS; Opinion;';   # opinion pieces we don't need (used as a pattern by Check)
$out      = './out/';                    # destination for output files
$skip     = 'skip';                      # suffix of the file that records what we skip from the input

# --- file names ---
$file_list = './files.list';             # list of files to be processed in ascii folder
#$outfile  = $out.'nnfoutfile.txt';      # output file; this will be renamed
$outfile   = 'nnfoutfile.txt';           # output file; this will be renamed

# --- month abbreviation => two-digit month number, used to translate dates ---
%month_number = ();
{
  my $number = 0;
  foreach my $abbrev (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) {
    $month_number{$abbrev} = sprintf('%02d', ++$number);
  }
}

# --- counters and buffers ---
$filt   = 0;    # how many lines we are filtering
$storyN = 0;    # no stories initially in the output file; counts stories written
$kfile  = 0;    # number of input files processed
$body   = '';   # text between a DOCS and LANGUAGE tags


# ======== subroutines =========== #

# Check -- decide whether a string belongs to a category that should be filtered out.
#   argument: $_[0], the string to test (a headline or a line of story text)
#   returns:  1 if the string matches any skip pattern; the string is then printed
#               to FSKIP as "Skipped <string>" and the global $filt is incremented
#             0 if nothing matches, i.e. it is ok to print to output
# Every pattern leads to the same action, so the order of the table does not matter.
sub Check {
  my $candidate = $_[0];

  my @skip_patterns = (
    # general topics
    qr/Books/i,
    qr/Sports/i,
    qr/business/i,
    qr/financial/i,
    qr/Trade talks/i,
    qr/$opinions/,     # global: the opinion-section header

    # newspaper sections
    qr/SECTION: Financial News/,
    qr/SECTION: SECTION 4, INSIDE SPORT;/,
    qr/SECTION: Features/,
    qr/SECTION: Singapore News/,
    qr/SECTION: SPORT/,
    qr/SECTION: Sport/,
    qr/SECTION: TEMPO; Something Personal;/,
    qr/SECTION: BUSINESS AND FINANCE;/,
    qr/SECTION: MONEY;/,
    qr/SECTION: NEWS; Features/,
    qr/SECTION: FASHION/,
    qr/SECTION: Business/,
    qr/SECTION: Home news/i,
    qr/SECTION: FEMAIL/,
    qr/SECTION: BUSINESS THIS WEEK/,
    qr/SECTION: NEWS AND FEATURES; Agenda/,
    qr/SECTION: ENTERTAINMENT/,
    qr/SECTION: UNDEFINED/,
    qr/SECTION: Entertainment, Television, and Culture/,
    qr/SECTION: MONITOR/,
    qr/SECTION: SPECTRUM/,
    qr/SECTION: NEWS AND FEATURES/i,
    qr/SECTION: NEWS; Arts/,     #cd4
    qr/SECTION: NEWS; Letters;/,
    qr/SECTION: Companies & Markets;/,
    qr/SECTION: SECTION 2, INVESTMENT AND FINANCE;/,

    # recurring columns, listings and market reports
    qr/Baseball/,
    qr/Review\/Film:/,
    qr/Restaurants/,
    qr/SURFERS' GREAT BREAKS/,
    qr/SECURITIES EXCHANGE/,
    qr/CLOSING PRICES/,
    qr/STOCK MOVEMENT/,
    qr/COMPANY NEWS/,
    qr/DEATH NOTICES/,
    qr/LETTER: OBITURAY:/,
    qr/(\w+) TSE/,
    qr/^TSE/,
    qr/(\w+) BOND PRICES/,
    qr/(\w+) Nasdaq/i,
    qr/Market Summary/,
    qr/Nikkei/i,
    qr/FILM/,
    qr/THEATER/,
    qr/TELEVISION/,
    qr/DANCE/,
    qr/Tokyo Stock Exchange:/,
    qr/closing price list/,
    qr/FOOD/,
    qr/SECTION: FOOD;/,
    qr/SOURCE: QNP/,
    qr/WEDDINGS/,
    qr/LETTERS TO THE EDITOR;/,
    qr/SHOW:/,
  );

  foreach my $pattern (@skip_patterns) {
    next unless $candidate =~ $pattern;

    # first hit wins: log it, count it, and tell the caller to skip
    print FSKIP "Skipped ".$candidate."\n";
    $filt++;
    return 1;
  }

  return 0; #ok for output
}

# Nexts -- recognise header/tag lines that should be left out of the output even
# though the rest of the story may still be fine (Check does the story-level filtering).
#   argument: $_[0], the line to test
#   returns:  1 if the line should be omitted, 0 if it is ok for output
#   side effect: a DATELINE line that did not already match one of the earlier
#                tags is stored in the global $dateline
sub Nexts {
  my $candidate = $_[0];

  # tags tested ahead of DATELINE; a hit here leaves $dateline untouched
  return 1 if $candidate =~ m/LENGTH|report by [A-Z]|BYLINE|SECTION/;

  if ($candidate =~ m/DATELINE/) {
    $dateline = $candidate;
    return 1;
  }

  return 1 if $candidate =~ m/SOURCE|KEYWORD-HIT/;

  return 0; #ok for output
}

sub Filter {
# Takes one story body (the text between a "DOCUMENTS" header and the first
# end-of-story tag), pulls out its header fields, and either prints the story
# to FOUT or drops it.
#
# Argument:
#   $_[0]  the story body as one string with embedded newlines
#
# Returns nothing meaningful.  The story is dropped (plain return, nothing
# written to FOUT) when
#   - any line mentions one of the excluded place names; that line is logged
#     to GREP and $grepCount is incremented, or
#   - Check() rejects the headline or a line of the body text.
#
# Reads the globals %month_number, $srcid, $seqno, $filename, $fileprfx and
# $lastdate; updates $seqno, $lastdate, $storyN and $grepCount.
#
# NOTE: $newdate and $date are not reset per story, so a story with no
# recognizable date line reuses the values left by the previous story.

  # filter flag; stays 1 unless a headline is found and passes Check()
  $filtfl=1;

  # used for indexing purposes
  $firsttext = '0';   # becomes '1' once the news-source line has been taken
  $gotHL = -1;        # index of the line expected to hold the headline;
                      # -1 (not 0) so that line 0 can never be mistaken for it
  my $got_date = 0;   # true once this story's date line has been parsed

  # output per news story
  $headline = "Headline Not Found";
  $dateline = "Dateline Not Found";
  $source = "Source Not Found";
  $byline = "Byline Not Found";
  $section = "Section Not Found";
  $storylength = "Length Not Found";
  $key = ""; # every story gets assigned a key after MATCH finishes
  $text = ""; # the body of the news story

  # split keeps interior blank lines as empty entries (only trailing ones are
  # dropped), so the line indices used for the headline still count blank lines
  my @values = split(/\n/,$_[0]);

  MATCH: for my $ka (0 .. $#values) {

    my $val = $values[$ka];

    # Skipping blank lines
    next MATCH unless $val =~ m/\S/;

    # Skip every story that mentions Fallujah, Anbar or Basra (in any of the
    # spellings below), whatever line the mention appears on
    if ($val =~ m/Falluja|Fallouj|Falowja|Anbar|Basra|Basora/) {
      ++$grepCount;
      my @words = split(/ /, $val);  # split and re-join on single spaces: the
      print GREP "@words\n\n";       # logged line loses only its trailing spaces
      return;
    }

    # Check to see if we have a "BYLINE"
    if ($val =~ m/BYLINE:/) {
      $byline = $val;
      next MATCH;
    }

    # Check to see if we have a "SECTION"
    if ($val =~ m/SECTION:/) {
      $section = $val;
      next MATCH;
    }

    # Check to see if we have a "LENGTH"
    if ($val =~ m/LENGTH:/) {
      $storylength = $val;
      next MATCH;
    }

    # Checking to see if we have a "DATELINE"
    if ($val =~ m/DATELINE:/) {
      $dateline = $val;
      next MATCH;
    }

    # The FIRST LINE OF TEXT is NEWS SOURCE
    if (($val =~ m/\w+/) && ($firsttext == 0)) {
      $firsttext = '1';
      $source = $val;
      next MATCH;
    }

    ## Avoiding "Edition" bugs for indexing the headline
    if ($val =~ m/\w+ Edition/ && ($ka < 10)) {
      $gotHL = ($ka+2); # the index of the headline
      next MATCH;
    }

    ## HEADLINE is always 2 lines after DATE, that means one line blank in the middle.
    ## A non-blank line directly after a headline line continues the headline.
    if ($gotHL == $ka) {
      if ($headline eq "Headline Not Found") {$headline = "";}
      $headline = $headline.$val." ";
      $gotHL = ($ka+1); # the index of the line after this line of the headline
      $filtfl = Check($headline); # we want to skip certain headlines
      if ($filtfl) {return;} # end the subroutine
      next MATCH;
    }

    ## Checking if the current line matches the DATE format.
    ## Only the FIRST date of a story counts: dates that appear later, inside
    ## the story text, must neither change the story date nor move the headline.
    if (!$got_date && $val =~ m/(\w+) (\d+), (\d\d\d\d)/) {
      my ($month, $day, $year) = (ucfirst(lc(substr($1,0,3))), $2, $3);
      if (exists $month_number{$month}) {  # a real month name, not just "word number, year"
        $monthno = $month_number{$month}; # convert month to numeric
        if (length($day) == 2) {$dayno = $day;}
        else {$dayno = "0".$day;}
        $newdate = $year.$monthno.$dayno;
        $date = $val;
        $gotHL = ($ka+2); # the index of the headline
        $got_date = 1;
        next MATCH;
      }
    }

    ## Establishing the body of the text.
    ## The program rid.pl has been incorporated here.
    if (Check($val)) {return;} # end the subroutine
    $text = $text.$val."\n";
    $text =~ s/^\s+//; #remove leading whites
    $text =~ tr/\x00-\x08//d;  #remove between 0-8 inclusive
    $text =~ tr/\x0B-\x1F//d;   #remove between 11-31
    $text =~ tr/\x80-\xFF//d;   #remove 128-255

  } # end MATCH loop

  # Assign the story a key
  $key = "$newdate-$srcid-$seqno-$filename-$fileprfx";

  if ($lastdate ne $newdate) {
    print "\n--------- ",$newdate," -------------\n";
    $lastdate = $newdate;
  }
  ++$seqno;  # string increment: '0001' becomes '0002', keeping the leading zeros

  ## FORMATTING AND PRINTING TO FILE RELEVANT INFORMATION ##

  $headline =~ s/^\s+//; #remove leading whites
  $source =~ s/^\s+//; #remove leading whites
  $dateline =~ s/^\s+//; #remove leading whites
  $date =~ s/^\s+//; #remove leading whites
  $byline =~ s/^\s+//; #remove leading whites
  $storylength =~ s/^\s+//; #remove leading whites
  $section =~ s/^\s+//; #remove leading whites

  chomp($text);
  print "Filter?: $filtfl\n";
  print "Printing story...newdate-$srcid-$seqno-$filename\n";
  print FOUT "---------------------------------------------------------------\n";
  print FOUT "$headline\n";
  print FOUT "$key\n";
  print FOUT "$date\n";
  print FOUT "(c) "."$source\n";   #"--$storyN--$filtfl"."\n";
  print FOUT "$section\n";
  print FOUT "$dateline\n";
  print FOUT "$byline\n";
  print FOUT "$storylength\n\n";
  print FOUT "$text\n\n";

  $storyN++; # count how many stories processed

} # end Filter subroutine

# ======== main program =========== #

# Main driver: reads the names of the input files from $file_list, cuts every
# input file into stories (a line matching "<n> DOCUMENTS" starts a story, the
# first end-of-story tag closes it), hands each story body to Filter(), and
# finally renames the temporary output file to "<prefix>.genfed".

$fileprfx = $ARGV[0];  # set final file name prefix
if (!defined($fileprfx) || length($fileprfx) == 0) {
  print "\aFile name prefix is a required argument! -- please re-run program\n";
  exit;
}

# open input, output, skip file and grepped-stories file
# (three-argument open, so a file name is never interpreted as an open mode)
open(FSKIP, '>', "$fileprfx.$skip") or die "Can\'t open skipped file $fileprfx.$skip; error $!";
open(FDIR,  '<', $file_list)        or die "Can\'t open list of input files $file_list; error $!";
open(FOUT,  '>', $outfile)          or die "Can\'t open output file $outfile; error $!";
open(GREP,  '>', 'grepCheck')       or die "Can\'t open grepped-stories file grepCheck; error $!";

# some counts
$grepCount = 0;

FILELIST: while ($filename = <FDIR>) {
  print "\n==========================\nProcessing $filename\n";

  $seqno = '0001';  # story sequence numbers restart for every input file
  chomp($filename);
  next FILELIST unless $filename =~ m/\S/;  # a blank line in the list is not a file

  open(INFILE, '<', $filename) or die "Can\'t open input file $filename; error $!";
  $line = <INFILE>;

  FILE: while (!eof(INFILE)) {

    $line =~ s/^\s+//; #remove leading whites
    if ($line =~ m/\d+ DOCUMENTS/) {  #found a doc tag, extract body
      #once we get a doc tag, get the body of the article
      #LANGUAGE TAG MISSING SOMETIMES, Dec05_1993_LN_NP1.txt, 424 of 1221 DOCUMENTS
      #so any one of the tags below is taken as the end of the story
      STORY: while ($line = <INFILE>) {
        last STORY if $line =~
          m/LANGUAGE:|SUBJECT:|ORGANIZATION:|GEOGRAPHIC:|LOAD-DATE:|PUBLICATION-TYPE:|DOCUMENT-TYPE:/;
        $body.=$line;
      } # end STORY loop

      # take the body and send it to the Filter subroutine
      Filter($body);
      print "Looping...$kfile files, $storyN stories\n";
      $body="";

    } #if found doc

    $line = <INFILE>; #go next line
    print "$line";

  } # end FILE loop

  close(INFILE);
  $kfile++; #count how many input files

} # end FILELIST loop

# Close everything before renaming, so the output is completely written (and a
# write error is reported) while the file still has its temporary name.
close(FOUT) or die "Can\'t close output file $outfile; error $!";
close(FSKIP);
close(FDIR);
close(GREP);

rename($outfile, "$fileprfx.genfed") or die "Can\'t rename output file $outfile; error $!";

print "\n\aFinished: $kfile files processed, $storyN stories.\n";
print "Our final numer of skipped stories because of GREP is: $grepCount.\n";
exit;
